Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ape)
library(msa) ## need XVector, Biostrings. All through biocLite
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, regroup, slice
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
library(seqinr)
## 
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
## 
##     translate
## The following objects are masked from 'package:ape':
## 
##     as.alignment, consensus
## The following objects are masked from 'package:dplyr':
## 
##     count, query
library(RColorBrewer)

Load data

df = read.table("140117.cdr12.txt", header = T, sep = "\t", stringsAsFactors = F) %>%
  filter(species == "HomoSapiens", grepl("TR[AB].+\\*01", gene)) %>%
  dplyr::select(gene, seqaa, cdr1aa, cdr2aa) %>%
  mutate(chain = ifelse(grepl("TRA", gene), "TRA", "TRB"),
         gene = gsub("\\*01", "", gene))

df.prob = read.table("cdr.contact.txt", header = T, sep = "\t")

Multiple alignment of amino acid sequences

Set up plotting

df.trb = subset(df, chain == "TRB")

clrs = colorRampPalette(rev(brewer.pal(9, 'Paired')))(length(df.trb$gene))

seq = df.trb$seqaa
names(seq) = df.trb$gene

trbAln = msa(AAStringSet(seq))
## use default substitution matrix
print(trbAln)
## CLUSTAL 2.1  
## 
## Call:
##    msa(AAStringSet(seq))
## 
## MsaAAMultipleAlignment with 64 rows and 99 columns
##      aln                                              names
##  [1] DAEITQSPRHKITETGRQVTLAC...PLTLESAASSQTSVYFCASSE- TRBV10-1
##  [2] DAGITQSPRHKVTETGTPVTLRC...LLTLESATSSQTSVYFCAISE- TRBV10-3
##  [3] DAGITQSPRYKITETGRQVTLMC...PLTLESATRSQTSVYFCASSE- TRBV10-2
##  [4] NAGVTQTPKFRVLKTGQSMTLLC...LLGLESAAPSQTSVYFCASSY- TRBV6-2
##  [5] NAGVTQTPKFRVLKTGQSMTLLC...LLGLESAAPSQTSVYFCASSY- TRBV6-3
##  [6] NAGVTQTPKFQVLKTGQSMTLQC...SLRLESAAPSQTSVYFCASSE- TRBV6-1
##  [7] NAGVTQTPKFHILKTGQSMTLQC...PLRLVSAAPSQTSVYLCASSY- TRBV6-8
##  [8] NAGVTQTPKFHILKTGQSMTLQC...PLRLESAAPSQTSVYFCASSY- TRBV6-9
##  [9] NAGVTQTPKFQVLKTGQSMTLQC...PLRLLSAAPSQTSVYFCASSY- TRBV6-5 
##  ... ...
## [57] DARVTQTPRHKVTEMGQEVTMRC...TLKIQPSEPRDSAVYFCASGL- TRBV12-5
## [58] EAGVTQFPSHSVIEKGQTVTLRC...TLKVQPAELEDSGVYFCASSQ- TRBV14
## [59] EPEVTQTPSHQVTQMGQEVILRC...TLKIRSTKLEDSAMYFCASSE- TRBV2
## [60] SQTIHQWPATLVQPVGSPLSLEC...ILSSKKLLLSDSGFYLCAWS-- TRBV30
## [61] GAVVSQHPSWVICKSGTSVKIEC...TLTVTSAHPEDSSFYICSAR-- TRBV20-1
## [62] SAVVSQHPSRVICKSGTSVNIEC...ALTVTSAHPEDSSFYICSAR-- TRBV20/OR9-2
## [63] SAVISQKPSRDICQRGTSLTIQC...TLTVSNMSPEDSSIYLCSVE-- TRBV29-1
## [64] SAVISQKPSRDICQRGTSMMIQC...TLTVSNRRPEDSSIYLCSVE-- TRBV29/OR9-2
##  Con ?AGVTQ?P???V???GQ?VTL?C...?L????????DSA?Y?CASS?- Consensus
names(clrs) = names(trbAln@unmasked)

plot_tree = function(seqs, tip_colors = clrs) {
  trbAln = msa(AAStringSet(seq))
  
  tip_colors = tip_colors[names(trbAln@unmasked)]

  trbAln2 = msaConvert(trbAln, type="seqinr::alignment")
  d = dist.alignment(trbAln2, "identity")
  trbTree = nj(d)

  fig = c(0.1, 0.9, 0, 1.0)
  mar = c(0, 0, 0, 0)
  par(fig = fig, mar = mar, xpd = NA)#, bg = "grey")
  plot(trbTree, type = "fan",  tip.color = tip_colors)
  
  trbTree
}

Full V length

seq = df.trb$seqaa
names(seq) = df.trb$gene
full_phylo = plot_tree(seq)
## use default substitution matrix

CDR1

seq = df.trb$cdr1aa
names(seq) = df.trb$gene
cdr1_phylo = plot_tree(seq)
## use default substitution matrix

CDR2

seq = df.trb$cdr2aa
names(seq) = df.trb$gene
cdr2_phylo = plot_tree(seq)
## use default substitution matrix

CDR1+2

seq = paste0(df.trb$cdr1aa, df.trb$cdr2aa)
names(seq) = df.trb$gene
cdr12_phylo = plot_tree(seq)
## use default substitution matrix

Compare trees

fig = c(0.1, 0.9, 0, 1.0)
mar = c(0, 0, 0, 0)
par(fig = fig, mar = mar, xpd = NA, cex=0.3)

Full V alignment vs CDR1

cophyloplot(full_phylo, cdr1_phylo, assoc = cbind(df.trb$gene, df.trb$gene), space=200, col = "red")

Full V alignment vs CDR2

cophyloplot(full_phylo, cdr2_phylo, assoc = cbind(df.trb$gene, df.trb$gene), space=200, col = "red")

Full V alignment vs CDR1+2

cophyloplot(full_phylo, cdr12_phylo, assoc = cbind(df.trb$gene, df.trb$gene), space=200, col = "red")

Computing number of possible antigen contacts

Compute contacts

pos_max = df.prob$pos_max[1]

compute_contacts = function(tcr_chain, tcr_region, region_seq) {
  df.seq = data.frame(tcr_chain = tcr_chain,
                      tcr_region = tcr_region,
                      aa_tcr = strsplit(region_seq, "")[[1]])
  
  df.seq$pos_rel_tcr = with(df.seq, (1:length(aa_tcr) - 1) / (length(aa_tcr) - 1) * (pos_max - 1))

  df.seq = merge(df.seq, df.prob)
  
  sum(df.seq$P)
}

df.cont = df %>%
  group_by(gene) %>%
  mutate(P = compute_contacts(chain, "CDR1", cdr1aa) + compute_contacts(chain, "CDR2", cdr2aa))
clrs2 = colorRampPalette(rev(brewer.pal(11, 'RdBu')))(101)

df.cont$Pl = rank(df.cont$P)
df.cont$Pind = with(df.cont, 100 * (Pl - min(Pl)) / (max(Pl) - min(Pl)) + 1)

clrs2 = clrs2[df.cont$Pind]
names(clrs2) = df.cont$gene
seq = df.trb$seqaa
names(seq) = df.trb$gene
full_phylo = plot_tree(seq, clrs2)
## use default substitution matrix

df.tra = subset(df, chain == "TRA")
seq = df.tra$seqaa
names(seq) = df.tra$gene
full_phylo = plot_tree(seq, clrs2)
## use default substitution matrix